This notebook explores the use of SingleR to perform cell-type annotation on datasets from the ScPCA project SCPCP000007 (Gawad lab data).

Note: The first time you run this code it may take a few more minutes due to reference downloads, but they will be cached for faster future execution.

0.1 Set Up

# load the R project
project_root <- here::here()
renv::load(project_root)

suppressPackageStartupMessages({
  library(SingleCellExperiment)
  library(SingleR)
  library(celldex)
  library(ggplot2)
})
theme_set(theme_bw())
set.seed(params$seed) # unclear if this is doing anything? probably not. but maybe later!

utils_dir <- file.path(project_root, "scripts", "utils")

# source the helper functions to grab the integration method check
source(file.path(utils_dir, "integration-helpers.R"))

gawad_project_id <- "SCPCP000007"
sce_file_suffix <- "processed_citeseq.rds"

Read in the data:

library_metadata_df <- readr::read_tsv(params$library_file)
## Rows: 25 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (15): project_name, submitter, library_biomaterial_id, sample_biomateria...
## lgl  (1): has_CITEseq
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Define the SCE filenames
sce_file_paths <- library_metadata_df %>%
  dplyr::filter(project_name == gawad_project_id) %>%
  dplyr::mutate(sce_file_path = file.path(
    integration_input_dir, sample_biomaterial_id, glue::glue("{library_biomaterial_id}_{sce_file_suffix}")
  )) %>%
  dplyr::pull(sce_file_path)

0.2 Initial exploration

Currently the code chunks in this section are not evaluated, but remain here for now for posterity.

To build an initial sense of what we can expect annotating with SingleR, let’s just look at one SCE (arbitrarily chosen as the first):

# For now let's just explore one!
sce_file <- sce_file_paths[[1]]

sce <- readr::read_rds(sce_file)
sce

The celldex package contains bulk RNA-Seq datasets for use as reference. Since this is AML data, we’ll want a reference that has a lot of blood information. According to the celldex, the Human Primary Cell Atlas data will be our closest match.

The HPCA reference consists of publicly available microarray datasets derived from human primary cells (Mabbott et al. 2013). Most of the labels refer to blood subpopulations but cell types from other tissues are also available.

# define reference with ensembl IDs to match our IDs
ref_se <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)

# Predict cell types
#  If we change to `labels = ref_se$label.fine`, we'll get more 
#  fine-grained annotations with subtypes etc.
preds_hpca <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- tibble::as_tibble(preds_hpca$labels) %>%
  dplyr::rename(celltype = value) %>%
  dplyr::mutate(cell_barcode = rownames(preds_hpca),
                reference = "hpca")

# Save the celltypes:
hpca_celltypes <- unique(ref_se$label.main)

And a heatmap version, showing all celltypes. The bar the top shows the final assignment, which are the rows with highest scores.

SingleR::plotScoreHeatmap(preds_hpca)

But can’t hurt to see how this compares to the Blueprint/ENCODE reference:

The Blueprint/ENCODE reference consists of bulk RNA-seq data for pure stroma and immune cells generated by Blueprint (Martens and Stunnenberg 2013) and ENCODE projects (The ENCODE Project Consortium 2012).

# define reference with ensembl IDs to match our IDs
ref_se <- celldex::BlueprintEncodeData(ensembl = TRUE)

# Predict cell types, broadly
preds_blue_enc <- SingleR::SingleR(
  # dataset we want to annotate
  test = sce, 
  # reference dataset
  ref = ref_se,
  # label.main is broad 
  labels = ref_se$label.main)


# Results into a tibble:
preds_df <- preds_df %>%
  dplyr::bind_rows(
    tibble::as_tibble(preds_blue_enc$labels) %>%
    dplyr::rename(celltype = value) %>%
    dplyr::mutate(cell_barcode = rownames(preds_blue_enc),
                  reference = "blueprint_encode")
  )

# Save the celltypes:
blueprint_celltypes <- unique(ref_se$label.main)

# And the heatmap:
SingleR::plotScoreHeatmap(preds_blue_enc)

What did the references predict?

# How many of each celltype?
preds_df %>%
  dplyr::filter(reference == "hpca") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)


preds_df %>%
  dplyr::filter(reference == "blueprint_encode") %>%
  dplyr::count(celltype) %>%
  dplyr::arrange(-n)

We’ll have to harmonize the celltype names between references to do a robust comparison, but from a very quick glance, overlap is thinner than one might like. That said, we don’t necessarily expect Blueprint/ENCODE to do particularly well anyways!

For a clearer comparison, we’ll harmonize predicted celltypes, but let’s just focus on exactly matching celltypes as a start -

# Let's see what we have here:
sort(hpca_celltypes)
sort(blueprint_celltypes)

# Manually compiled data rame of celltypes roughly present in BOTH references
#  for now leaving the Pre/Pro B-cells out
shared_celltypes_df <- tibble::tribble(
  ~hpca_celltype, ~blueprint_celltype,
  # Both references have it:
  "Astrocyte", "Astrocytes",
  "B_cell", "B-cells",
  "Chondrocytes","Chondrocytes",
  "DC", "DC", 
  "Endothelial_cells", "Endothelial cells", 
  "Epithelial_cells", "Epithelial cells", 
  "Fibroblasts", "Fibroblasts", 
  "Keratinocytes", "Keratinocytes", 
  "Macrophage", "Macrophages", 
  "Monocyte", "Monocytes", 
  "Neurons", "Neurons", 
  "Neutrophils", "Neutrophils", 
  "NK_cell", "NK cells", 
  "Smooth_muscle_cells", "Smooth muscle", 
  # two groups - collapse back to overall T cells
  "T_cells", "CD8+ T-cells",
  "T_cells", "CD4+ T-cells",
  # two groups of HSCs, again, collapse back to overall
  "HSC_-G-CSF", "HSC", 
  "HSC_CD34+", "HSC"
) %>%
  dplyr::mutate(harmonized_celltype = ifelse(blueprint_celltype == "HSC",
                                             blueprint_celltype, 
                                             hpca_celltype))

harmonize_celltypes <- function(preds_df, reference_name) {
  if (reference_name == "hpca") {
    shared_celltypes_colname <- rlang::sym("hpca_celltype")
  } else {
    shared_celltypes_colname <- rlang::sym("blueprint_celltype")
  }
  
  filtered_preds_df <- preds_df %>%
    dplyr::filter(reference == reference_name) 
  
  sym_reference_name <- rlang::sym(reference_name)
  filtered_preds_df %>%
    dplyr::inner_join(
      dplyr::select(
        shared_celltypes_df, 
        celltype = {{shared_celltypes_colname}},
        harmonized_celltype
      )
    ) %>% 
    dplyr::select(cell_barcode, {{sym_reference_name}} := harmonized_celltype) %>%
    dplyr::right_join(filtered_preds_df) %>%
    dplyr::mutate({{reference_name}} := dplyr::if_else(
      {{sym_reference_name}} %in% shared_celltypes_df$harmonized_celltype,
      {{sym_reference_name}},
      celltype
    )) %>% 
    dplyr::select(-celltype, -reference)
  }
  
preds_df_harmonized <- harmonize_celltypes(preds_df, "blueprint_encode") %>%
  dplyr::left_join(
    harmonize_celltypes(preds_df, "hpca")  
  )

# How often do they match?

preds_df_harmonized %>%
  dplyr::summarize(percent_same = sum(blueprint_encode == hpca) / dplyr::n() )


# Where do they not match?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode)
  
# Are there specific combinations that get differently annotated?
preds_df_harmonized %>%
  dplyr::filter(hpca != blueprint_encode) %>%
  dplyr::select(-cell_barcode) %>%
  dplyr::count(blueprint_encode, hpca) %>%
  dplyr::arrange(-n)

0.3 Let’s get to it!

Now, let’s do cell type annotation with the given reference in params$reference on each of the Gawad libraries and look at their UMAPs colored by celltype.

if (params$reference == "hpca") {
  ref_data <- celldex::HumanPrimaryCellAtlasData(ensembl = TRUE)
} else if (params$reference == "blueprint_encode") {
  ref_data <- celldex::BlueprintEncodeData(ensembl = TRUE)
} else {
  stop("Bad reference parameter; either 'hpca' or 'blueprint_encode'.")
}
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## snapshotDate(): 2022-04-25
## loading from cache
## require("ensembldb")
## Warning: Unable to map 1470 of 19363 requested IDs.
annotate_SingleR <- function(sce, ref_data) {
  # return updated sce and the predictions themselves
  
  preds <- SingleR::SingleR(
    test = sce, 
    ref = ref_data,
    labels = ref_data$label.main
  )
  
  # Add `pruned.labels`, where low-confidence annotations are NA, to sce
   sce$SingleR_annotations <- preds$pruned.labels
   
   return(
     list(
       sce = sce,
       preds = preds
       )
   )
}

plot_SingleR <- function(annotation_output) {
  # annotation_output: list of sce and preds
  # Make a heatmap and UMAP and print out
  
  heatmap <- SingleR::plotScoreHeatmap(annotation_output[["preds"]], 
                                       # default but let's be explicit
                                       show.pruned = FALSE)

  
  umap_df <- tibble::as_tibble(reducedDim(annotation_output[["sce"]], "UMAP")) %>%
    dplyr::select(UMAP1 = V1, UMAP2 = V2) %>%
    dplyr::mutate(celltypes = annotation_output[["sce"]]$SingleR_annotations)
  
  # We would probably like a more principled approach to colors that is
  #  actually based on biological information about celltypes
  plot_colors <- rainbow( length(unique(umap_df$celltypes[!(is.na(umap_df$celltypes))])))
  
  umap <- ggplot(umap_df) + 
    aes(x = UMAP1, y = UMAP2, color = celltypes) +
    geom_point(size = 0.2, alpha = 0.5) + 
    scale_color_manual(values = plot_colors)
  
  print(heatmap)
  print(umap)
  
}

# Function to run and plot SingleR
# Return SCE with annotations `SingleR_annotations` column
run_SingleR <- function(sce, viz = TRUE) {
  # Print out the library
  print(unique( metadata(sce)$library ))
  
  # Run and plot (if TRUE) annotations
  anno <- annotate_SingleR(sce, ref_data)
  if (viz) {
    # there is no interesting color palette harmonization among library colors here!
    plot_SingleR(anno)
  }
  return(anno[["sce"]])
}

Here we go!

# Read in all SCE files
sce_list <- purrr::map(
  sce_file_paths, 
  readr::read_rds
)

# Annotate them all, popping out some viz along the way
singler_sce_list <- purrr::map(sce_list, run_SingleR, viz = TRUE)
## [1] "SCPCL000295"
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
## Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## [1] "SCPCL000296"

## [1] "SCPCL000297"

## [1] "SCPCL000298"

## [1] "SCPCL000299"

Now let’s plot, to start, the fastMNN UMAP but with celltype annotations from individual libraries. As is deeply commented all over the place, we definitely need to infuse palettes with biology!

# helper to count the number of celltypes across all libraries
count_celltypes <- function(sce) {
 tibble::tibble(celltype = colData(sce)$SingleR_annotations ) %>%
    dplyr::count(celltype) %>%
    dplyr::arrange(-n) %>%
    tidyr::drop_na() 
}

# Celltype order based on **overall** counts for all pooled annotations 
celltype_order <- purrr::map_df(singler_sce_list, count_celltypes) %>%
  dplyr::group_by(celltype) %>%
  dplyr::summarize(celltype_counts = sum(n)) %>%
  dplyr::arrange(-celltype_counts) %>%
  dplyr::pull(celltype)


# Again we'd eventually like some biology to go into the color choices!
cell_colors <- rainbow(length(celltype_order))

# Read in an integrated SCE just to start getting a sense
fastmnn <- readr::read_rds(file.path("results", "scpca", "integrated_sce", 
                                     paste0(gawad_project_id, "_integrated_fastmnn_sce.rds" ))
)


# Find all the celltype annotations for a combined color palette:
extract_celltypes <- function(sce) {
 tibble::tibble(batch = unique(metadata(sce)$library),
                celltype = colData(sce)$SingleR_annotations, 
                cell_barcode = rownames(colData(sce))) %>%
    dplyr::mutate(cell_name = paste(cell_barcode, batch, sep = "-"))
}

# fastMNN UMAP with individual library cell annotations colored
purrr::map_df(singler_sce_list, extract_celltypes) %>%
  dplyr::inner_join(
    tibble::as_tibble(colData(fastmnn), rownames = "cell_name")
  ) %>%
  dplyr::bind_cols(
    as.data.frame(reducedDim(fastmnn, "fastmnn_UMAP"))
  ) %>%
  dplyr::mutate(UMAP1 = V1, UMAP2 = V2) %>%
  # And let's make a UMAP!
  ggplot() +
  aes(x = UMAP1, y = UMAP2, color = celltype) + 
  geom_point(size = 0.2, alpha = 0.3) + 
  # we definitely want some biology here!
  scale_color_manual(values = cell_colors)
## Joining, by = c("batch", "cell_name")

0.4 Session Info

sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] ensembldb_2.20.2            AnnotationFilter_1.20.0    
##  [3] GenomicFeatures_1.48.3      AnnotationDbi_1.58.0       
##  [5] magrittr_2.0.3              ggplot2_3.3.6              
##  [7] celldex_1.6.0               SingleR_1.10.0             
##  [9] SingleCellExperiment_1.18.0 SummarizedExperiment_1.26.1
## [11] Biobase_2.56.0              GenomicRanges_1.48.0       
## [13] GenomeInfoDb_1.32.3         IRanges_2.30.0             
## [15] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [17] MatrixGenerics_1.8.1        matrixStats_0.62.0         
## 
## loaded via a namespace (and not attached):
##   [1] AnnotationHub_3.4.0           BiocFileCache_2.4.0          
##   [3] lazyeval_0.2.2                splines_4.2.1                
##   [5] BiocParallel_1.30.3           digest_0.6.29                
##   [7] htmltools_0.5.3               viridis_0.6.2                
##   [9] fansi_1.0.3                   memoise_2.0.1                
##  [11] ScaledMatrix_1.4.0            tzdb_0.3.0                   
##  [13] Biostrings_2.64.0             miQC_1.4.0                   
##  [15] readr_2.1.2                   vroom_1.5.7                  
##  [17] prettyunits_1.1.1             colorspace_2.0-3             
##  [19] blob_1.2.3                    rappdirs_0.3.3               
##  [21] xfun_0.32                     dplyr_1.0.9                  
##  [23] crayon_1.5.1                  RCurl_1.98-1.8               
##  [25] jsonlite_1.8.0                glue_1.6.2                   
##  [27] gtable_0.3.0                  zlibbioc_1.42.0              
##  [29] XVector_0.36.0                DelayedArray_0.22.0          
##  [31] BiocSingular_1.12.0           scales_1.2.0                 
##  [33] pheatmap_1.0.12               DBI_1.1.3                    
##  [35] Rcpp_1.0.9                    viridisLite_0.4.0            
##  [37] xtable_1.8-4                  progress_1.2.2               
##  [39] bit_4.0.4                     rsvd_1.0.5                   
##  [41] httr_1.4.3                    RColorBrewer_1.1-3           
##  [43] ellipsis_0.3.2                modeltools_0.2-23            
##  [45] farver_2.1.1                  pkgconfig_2.0.3              
##  [47] XML_3.99-0.10                 flexmix_2.3-18               
##  [49] nnet_7.3-17                   sass_0.4.2                   
##  [51] dbplyr_2.2.1                  utf8_1.2.2                   
##  [53] here_1.0.1                    labeling_0.4.2               
##  [55] tidyselect_1.1.2              rlang_1.0.4                  
##  [57] later_1.3.0                   munsell_0.5.0                
##  [59] BiocVersion_3.15.2            tools_4.2.1                  
##  [61] cachem_1.0.6                  cli_3.3.0                    
##  [63] generics_0.1.3                RSQLite_2.2.15               
##  [65] ExperimentHub_2.4.0           evaluate_0.16                
##  [67] stringr_1.4.0                 fastmap_1.1.0                
##  [69] yaml_2.3.5                    knitr_1.39                   
##  [71] bit64_4.0.5                   purrr_0.3.4                  
##  [73] KEGGREST_1.36.3               sparseMatrixStats_1.8.0      
##  [75] mime_0.12                     xml2_1.3.3                   
##  [77] biomaRt_2.52.0                compiler_4.2.1               
##  [79] rstudioapi_0.13               filelock_1.0.2               
##  [81] curl_4.3.2                    png_0.1-7                    
##  [83] interactiveDisplayBase_1.34.0 tibble_3.1.8                 
##  [85] bslib_0.4.0                   stringi_1.7.8                
##  [87] highr_0.9                     lattice_0.20-45              
##  [89] ProtGenerics_1.28.0           Matrix_1.4-1                 
##  [91] vctrs_0.4.1                   pillar_1.8.0                 
##  [93] lifecycle_1.0.1               BiocManager_1.30.18          
##  [95] jquerylib_0.1.4               BiocNeighbors_1.14.0         
##  [97] bitops_1.0-7                  irlba_2.3.5                  
##  [99] httpuv_1.6.5                  rtracklayer_1.56.1           
## [101] R6_2.5.1                      BiocIO_1.6.0                 
## [103] promises_1.2.0.1              renv_0.15.5                  
## [105] gridExtra_2.3                 codetools_0.2-18             
## [107] assertthat_0.2.1              rprojroot_2.0.3              
## [109] rjson_0.2.21                  withr_2.5.0                  
## [111] GenomicAlignments_1.32.1      Rsamtools_2.12.0             
## [113] GenomeInfoDbData_1.2.8        parallel_4.2.1               
## [115] hms_1.1.1                     grid_4.2.1                   
## [117] beachmat_2.12.0               tidyr_1.2.0                  
## [119] rmarkdown_2.14                DelayedMatrixStats_1.18.0    
## [121] shiny_1.7.2                   restfulr_0.0.15